#####################################
###                               ###
###   Women and Party Building    ###
###                               ###
### code_wrangling_MemberGender.R ###
###                               ###
#####################################

# This script cleans the raw party membership datasets and builds the aggregate membership datasets

rm( list=ls() )
library(genderBR)
library(stringr)
library(lubridate)


cat("Building the party member datasets. This will take a while.\n")

# Point the working directory at the folder that holds this script, using the rstudioapi package.
# The call is wrapped in try() so that an error here does not stop the rest of the script.
try({
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
})

# If that does not work for you, set the directory by hand with setwd( PATH )



### Step 1: First pass through the membership data, coding member gender and other member characteristics
#
# For every raw party file this step
#   (1) keeps the columns that are used later,
#   (2) codes gender from the first name in three passes: get_gender() on the name as given,
#       get_gender() on the accent-stripped name, and finally the last letter of the name,
#   (3) keeps only the earliest entry of each member, and
#   (4) saves one cleaned file per party in ../1_data/cleaned/membership_data/.
rm( list=ls() )
member.files <- list.files("../1_data/raw/membership_data/")
cat("Coding member gender and other personal characteristics...\nEstimated time: 5-10 minutes\n")
pb <- txtProgressBar(min = 0, max = length(member.files), style=3, initial = 0)
progress <- 0

# Accent-stripping table. chartr() pairs the i-th character of `old` with the i-th character of `new`,
# so every replacement in this table must be exactly ONE character long. A two-character replacement
# (such as "Ss" for 'ß') would shift every later pair by one position, so 'ß' is handled separately
# with gsub() inside the loop.
accent.map <- c('Š'='S', 'š'='s', 'Ž'='Z', 'ž'='z', 'À'='A', 'Á'='A', 'Â'='A', 'Ã'='A', 'Ä'='A', 'Å'='A', 'Æ'='A', 'Ç'='C', 'È'='E', 'É'='E',
                'Ê'='E', 'Ë'='E', 'Ì'='I', 'Í'='I', 'Î'='I', 'Ï'='I', 'Ò'='O', 'Ó'='O', 'Ô'='O', 'Õ'='O', 'Ö'='O', 'Ø'='O', 'Ù'='U',
                'Ú'='U', 'Û'='U', 'Ü'='U', 'Ý'='Y', 'Þ'='B', 'à'='a', 'á'='a', 'â'='a', 'ã'='a', 'ä'='a', 'å'='a', 'æ'='a', 'ç'='c',
                'è'='e', 'é'='e', 'ê'='e', 'ë'='e', 'ì'='i', 'í'='i', 'î'='i', 'ï'='i', 'ð'='o', 'ò'='o', 'ó'='o', 'ô'='o', 'õ'='o',
                'ö'='o', 'ø'='o', 'ù'='u', 'ú'='u', 'û'='u', 'ý'='y', 'þ'='b', 'ÿ'='y')
accented.chars <- paste(names(accent.map), collapse = "")
plain.chars <- paste(accent.map, collapse = "")
stopifnot(nchar(accented.chars) == nchar(plain.chars)) # guards the one-to-one pairing that chartr() relies on

for(file in member.files){
  
  # Party label = file name without the "data_raw_filiados_" prefix and the ".rds" extension
  party <- str_remove(str_remove(file, "[.]rds$"), "data_raw_filiados_")
  data <- readRDS(paste0("../1_data/raw/membership_data/", file))
  
  # NOTE(review): "CODIGO_DO_MUNICIPIO" is selected twice, and the second copy is renamed "Municipality"
  # further down, so "Municipality" currently holds the municipality code. If the raw files contain a
  # municipality-name column, it was presumably meant here -- TODO confirm against the raw data.
  data <- data[, c("NUMERO_DA_INSCRICAO", "NOME_DO_FILIADO_REDUCED", "SIGLA_DO_PARTIDO", "UF", "CODIGO_DO_MUNICIPIO", "CODIGO_DO_MUNICIPIO", "ZONA_ELEITORAL", "SECAO_ELEITORAL", "DATA_DA_FILIACAO", "DATA_DA_DESFILIACAO", "DATA_DO_CANCELAMENTO")]
  
  # Characters 7-10 of the affiliation date are read as the year (the date is parsed as %d/%m/%Y below)
  data$Year <- as.numeric(substring(data$DATA_DA_FILIACAO, 7, 10))
  data$First.Name <- data$NOME_DO_FILIADO_REDUCED
  
  # Use get_gender() to code gender based on first name, and extract observations that were not coded
  data$Gender.Prob <- get_gender(data$First.Name, prob = TRUE)
  uncoded <- data[is.na(data$Gender.Prob),]
  data <- data[!is.na(data$Gender.Prob),]
  
  # Now try again after getting rid of accents and other special characters
  uncoded$First.Name <- gsub("ß", "Ss", uncoded$First.Name, fixed = TRUE)
  uncoded$First.Name <- chartr(accented.chars, plain.chars, uncoded$First.Name)
  uncoded$Gender.Prob <- get_gender(uncoded$First.Name, prob = TRUE)
  data <- rbind(data, uncoded[!is.na(uncoded$Gender.Prob),])
  uncoded <- uncoded[is.na(uncoded$Gender.Prob),]
  
  # Finally, use the last letter of the first name for observations that still haven't been coded
  last.letter <- substring(uncoded$First.Name, nchar(uncoded$First.Name))
  uncoded$Gender <- ifelse(last.letter == "A", "F", ifelse(last.letter == "O", "M", NA))
  
  # A probability of exactly .5 is left uncoded (NA)
  data$Gender <- ifelse(data$Gender.Prob > .5, "F", ifelse(data$Gender.Prob < .5, "M", NA))
  data <- rbind(data, uncoded)
  
  # Remove duplicated members (some Brazilian parties have carried out voluntary mass re-registrations of their members over the years,
  #                            and this leads to duplicate entries in the TSE's dataset. We want only the earliest entry for each member
  #                            in a given party)
  data$Joined <- as.Date(data$DATA_DA_FILIACAO, "%d/%m/%Y")
  # Rows whose date or year could not be parsed are dropped explicitly: a logical row index containing
  # NA would otherwise turn each of them into a row that is NA in every column.
  keep <- !is.na(data$Joined) & data$Joined > as.Date("1980-01-01") &
    !is.na(data$Year) & data$Year <= 2020
  data <- data[keep,]
  data <- data[order(data$Joined),] # earliest affiliation first, so that duplicated() keeps it
  data <- data[!duplicated(data$NUMERO_DA_INSCRICAO),]
  
  names(data) <- c("ID", "Name", "Party", "State", "MunCode", "Municipality", "Zone", "Section", "Date", "Disaffiliated", "Canceled", "Year", "First.Name", "Gender.Prob", "Gender", "Joined")
  
  # Term = first year of the period in which the member joined
  # (1983-1985, 1986-1988, 1989-1992, then four-year periods up to 2017-2020); NA outside 1983-2020
  term.starts <- c(1983, 1986, 1989, 1993, 1997, 2001, 2005, 2009, 2013, 2017)
  term.index <- findInterval(data$Year, c(term.starts, 2021))
  term.index[term.index %in% c(0, length(term.starts) + 1)] <- NA
  data$Term <- term.starts[term.index]
  
  saveRDS(data, paste0("../1_data/cleaned/membership_data/data_cleaned_filiados_", party, ".rds"))
  rm(data)
  gc(FALSE)
  progress <- progress + 1
  setTxtProgressBar(pb, progress)
}
close(pb)




### Step 2: Identify which members switched parties

# Build a list with one element per party, holding the IDs of that party's members whose Year is 2016
# or earlier. The list is used afterwards to find IDs that appear in more than one party.
rm( list=ls() )
cleaned.dir <- "../1_data/cleaned/membership_data/"
member.files <- list.files(cleaned.dir)
member.list <- list()
cat("Creating a list of members across all parties...\nEstimated time: 1-3 minutes\n")
pb <- txtProgressBar(min = 0, max = length(member.files), style=3, initial = 0)
progress <- 0
for(i in seq_along(member.files)){
  
  # The bar is refreshed before each file is read, so it shows the number of files already completed
  setTxtProgressBar(pb, progress)
  
  party.name <- str_remove(str_remove(member.files[i], ".rds"), "data_cleaned_filiados_")
  party.data <- readRDS(paste0(cleaned.dir, member.files[i]))
  
  in.window <- as.numeric(party.data$Year) <= 2016
  member.list[[party.name]] <- party.data$ID[in.window]
  
  progress <- progress + 1
}
close(pb)

# Identify party switchers: any ID that occurs more than once in the pooled ID vector
all.ids <- unlist(member.list) # Flattens the per-party list into a single vector
is.repeat <- duplicated(all.ids)
dups <- unique(all.ids[is.repeat])

# Second pass over the cleaned membership files: pull out the rows that belong to those IDs
# (restricted to Year <= 2016) and stack them into one data frame
member.files <- list.files("../1_data/cleaned/membership_data/")
switchers <- c()
cat("Getting additional data on members who switched parties...\nEstimated time: 10-20 minutes\n")
pb <- txtProgressBar(min = 0, max = length(member.files), style=3, initial = 0)
for(i in seq_along(member.files)){
  
  party.data <- readRDS(paste0("../1_data/cleaned/membership_data/", member.files[i]))
  is.switcher <- party.data$ID %in% dups
  in.window <- party.data$Year <= 2016
  switchers <- rbind(switchers, party.data[is.switcher & in.window,])
  gc(FALSE)
  setTxtProgressBar(pb, i)
}
close(pb)

# Saved at this point because code_wrangling_Supplementary.R uses the same file
# to construct the elite party-switching dataset
saveRDS(switchers, "../1_data/cleaned/data_cleaned_Switchers.rds")


# Identify order of switching
# (The data are sorted by member ID and date of affiliation; each row is then paired with the row that
# follows it, which gives the next party and the next affiliation date whenever both rows belong to
# the same member)
cat("Sorting party-switching data...\n")
rm( list=ls() )
switchers <- readRDS("../1_data/cleaned/data_cleaned_Switchers.rds")

# Keep affiliations from 2000 onwards, and only members who still have more than one of them
switchers <- switchers[switchers$Year >= 2000,]
repeated.ids <- switchers$ID[duplicated(switchers$ID)]
switchers <- switchers[switchers$ID %in% repeated.ids,]
switchers <- switchers[order(switchers$ID, switchers$Joined),] # By personal ID, then by date of affiliation

# Copy party and date from the following row into the current row
n <- nrow(switchers)
this.row <- 1:(n-1)
next.row <- 2:n
switchers$Switched.Party <- NA
switchers$Switched.Date <- ""
switchers$Switched.Party[this.row] <- switchers$Party[next.row]
switchers$Switched.Date[this.row] <- switchers$Date[next.row]

# Blank the copied values wherever the following row is a different person
next.id <- c(switchers$ID[next.row], NA)
new.person <- switchers$ID != next.id
switchers$Switched.Party[new.person] <- NA
switchers$Switched.Date[new.person] <- NA

# Members whose next affiliation carries the same date as the current one are set aside in
# `duplicates`, and all of their rows are removed from `switchers`
same.date <- which(switchers$Date == switchers$Switched.Date)
duplicates <- switchers[same.date,]
duplicates <- duplicates[!is.na(duplicates$ID),]
switchers <- switchers[!(switchers$ID %in% duplicates$ID),]

# Disaffiliation and cancellation dates are blanked for every remaining switcher row
switchers$Disaffiliated <- NA
switchers$Canceled <- NA




# Finally, add this data on party switching to cleaned membership datasets
# (Note: this will overwrite the old data_cleaned_filiados_ files)
cleaned.dir <- "../1_data/cleaned/membership_data/"
member.files <- list.files(cleaned.dir)
cat("Adding information on party-switching to the cleaned party membership dataset...\nEstimated time: 5-15 minutes\n")
pb <- txtProgressBar(min = 0, max = length(member.files), style=3, initial = 0)
for(i in seq_along(member.files)){
  
  file.path.i <- paste0(cleaned.dir, member.files[i])
  party <- str_remove(str_remove(member.files[i], ".rds"), "data_cleaned_filiados_")
  
  # Give the party file the two switching columns, with the "did not switch" defaults
  party.data <- readRDS(file.path.i)
  party.data$Switched.Party <- NA
  party.data$Switched.Date <- ""
  
  # Drop every row of a member found in `switchers` or `duplicates`, then append the
  # switcher rows whose Party equals this file's party label
  is.switcher <- party.data$ID %in% switchers$ID
  is.duplicate <- party.data$ID %in% duplicates$ID
  party.data <- party.data[!(is.switcher | is.duplicate),]
  party.data <- rbind(party.data, switchers[switchers$Party == party,])
  
  saveRDS(party.data, file.path.i)
  rm(party.data)
  gc(FALSE)
  setTxtProgressBar(pb, i)
}
close(pb)




### Step 3: Build aggregated datasets


# This first dataset is at the party-municipality-term level, and it will be used to build the main RD datasets

# This will use the data on the municipal electorate to normalize recruitment by the number of male and female
# voters living in the municipality
rm( list=ls() )
electorate <- readRDS("../1_data/raw/data_raw_electorate.rds")
names(electorate) <- c("PERIODO", "MUNICIPIO", "MunCode", "Voters", "Female.Voters", "Male.Voters")
electorate <- electorate[electorate$PERIODO %in% c(2000, 2004, 2008, 2012, 2016),]

# Iterate over parties
member.files <- list.files("../1_data/cleaned/membership_data/")
merged.data <- c()
cat("Aggregating membership data to the municipality-party-term level...\nEstimated time: 5-10 minutes\n")
pb <- txtProgressBar(min = 0, max = length(member.files), style=3, initial = 0)
progress <- 0
for(file in member.files){
  
  party <- str_remove(str_remove(file, ".rds"), "data_cleaned_filiados_")
  data <- readRDS(paste0("../1_data/cleaned/membership_data/", file))
  data$PERIODO <- data$Term - 1 # PERIODO is the year before the term starts; it is the key used to merge with the electorate data
  data <- data[which(data$Term > 2000),] # which() drops rows with a missing Term instead of turning them into all-NA rows
  
  # Count number of Female and Male recruits who joined during that term
  # (rows with a missing Gender are NA in the gender counts and are therefore left out by aggregate() below)
  data$Recruits <- 1
  data$Female.Recruits <- as.numeric(data$Gender == "F")
  data$Male.Recruits <- as.numeric(data$Gender == "M")
  
  # Code party switching by gender.
  # Switched.Date is "" for members who never switched and NA for a switcher's final affiliation
  # (when the next observation is a different person). Both cases mean "did not switch away from this
  # party", so NA must be coded FALSE here: aggregate() with a formula drops every row that has an NA in
  # any of its variables, and an NA in Switched would silently remove that recruit from ALL of the counts.
  data$Switched <- !is.na(data$Switched.Date) & data$Switched.Date != ""
  data$Switched.Female <- data$Switched & data$Gender == "F"
  data$Switched.Male <- data$Switched & data$Gender == "M"
  
  # A member counts as having left when the disaffiliation year (last four characters of the
  # disaffiliation date) is later than the year of joining
  data$Disaffiliate.Year <- as.numeric(substring(data$Disaffiliated, nchar(data$Disaffiliated)-3, nchar(data$Disaffiliated)))
  data$Left <- !is.na(data$Disaffiliate.Year) & data$Disaffiliate.Year > data$Year
  data$Left.Female <- data$Left & data$Gender == "F"
  data$Left.Male <- data$Left & data$Gender == "M"
  
  # Aggregate to the municipality-party-term level
  data <- aggregate(cbind(Recruits, Female.Recruits, Male.Recruits,  Switched, Switched.Female, Switched.Male, Left, Left.Female, Left.Male) ~ Party + State + MunCode + Municipality + PERIODO, data=data, FUN=sum)
  
  # Keep every municipality-term of the electorate data, even when the party recruited nobody there
  data <- merge(electorate, data, by=c("PERIODO", "MunCode"), all.x=TRUE)
  data$Recruits[is.na(data$Recruits)] <- 0 # municipality-terms that did not appear in the members dataset for that party are assumed to be cases where nobody from that municipality joined during that term
  data$Female.Recruits[is.na(data$Female.Recruits)] <- 0
  data$Male.Recruits[is.na(data$Male.Recruits)] <- 0
  
  # Calculate recruits as a share of all of the voters of that same gender (measured as recruits for 1000 registered voters)
  data$Recruit.Share <- 1000*data$Recruits / data$Voters
  data$Female.Recruit.Share <- 1000*data$Female.Recruits / data$Female.Voters
  data$Male.Recruit.Share <- 1000*data$Male.Recruits / data$Male.Voters
  
  # Use the file-derived party label for every row (rows added by the merge have no Party value)
  data$Party <- party
  
  merged.data <- rbind(merged.data, data)
  rm(data)
  gc(FALSE)
  progress <- progress + 1
  setTxtProgressBar(pb, progress)
}
close(pb)
saveRDS(merged.data, "../1_data/cleaned/data_cleaned_membership_PartyMunicipalityTerm.rds")




# This next dataset is at the party-municipality-month level, and it will be used for the election-cycle analyses 
# in the appendix (FIGURE Q.1 and FIGURE Q.2)

rm( list=ls() )
member.files <- list.files("../1_data/cleaned/membership_data/")
merged.data <- c()
cat("Aggregating membership data to the municipality-party-month level...\nEstimated time: 10-20 minutes\n")
pb <- txtProgressBar(min = 0, max = length(member.files), style=3, initial = 0)

# Labels "1.01", "1.02", ..., "4.12": year within the term, a dot, and the two-digit month
month.labels <- paste0(rep(1:4, each = 12), ".", str_pad(1:12, 2, "left", "0"))

for(i in seq_along(member.files)){
  
  party <- str_remove(str_remove(member.files[i], ".rds"), "data_cleaned_filiados_")
  data <- readRDS(paste0("../1_data/cleaned/membership_data/", member.files[i]))
  data$PERIODO <- data$Term - 1
  data <- data[data$Term > 2000,]
  
  # Overall and gender-specific recruit indicators (they are removed again before aggregating)
  is.female <- data$Gender == "F"
  is.male <- data$Gender == "M"
  data$Recruits <- 1
  data$Female.Recruits <- as.numeric(is.female)
  data$Male.Recruits <- as.numeric(is.male)
  
  # Position of the joining date inside the four-year term: year 1 to 4, and "year.month"
  data$Year.In.Term <- (year(data$Joined) - 1) %% 4 + 1
  data$Month.In.Term <- paste0(data$Year.In.Term, ".", str_pad(month(data$Joined), 2, "left", "0"))
  
  # One female and one male indicator column for each year of the term ...
  for(y in 1:4){
    in.year <- data$Year.In.Term == y
    data[[paste0("F.", y)]] <- as.numeric(in.year & is.female)
    data[[paste0("M.", y)]] <- as.numeric(in.year & is.male)
  }
  
  # ... and for each month of the term
  for(this.month in month.labels){
    in.month <- data$Month.In.Term == this.month
    data[[paste0("F.", this.month)]] <- as.numeric(in.month & is.female)
    data[[paste0("M.", this.month)]] <- as.numeric(in.month & is.male)
  }
  
  # Keep only the grouping variables and the indicator columns, then sum the indicators within each group
  data <- subset(data, select = -c(ID, Name, Zone, Section, Date, Disaffiliated, Canceled, First.Name, Gender.Prob, Gender, Joined, Term, Switched.Party, Switched.Date,  Recruits, Female.Recruits, Male.Recruits, Year.In.Term, Month.In.Term, Year) )
  ag <- aggregate(. ~ Party + State + MunCode + Municipality + PERIODO, data=data, FUN="sum")
  
  ag$Party <- party
  merged.data <- rbind(merged.data, ag)
  rm(data)
  gc(FALSE)
  setTxtProgressBar(pb, i)
}
close(pb)


# Code gender gap for each time period
data <- merged.data

# Electorate size by municipality and election year, used to express recruits per 1000 voters
electorate <- readRDS("../1_data/raw/data_raw_electorate.rds")
names(electorate) <- c("PERIODO", "MUNICIPIO", "MunCode", "Voters", "Female.Voters", "Male.Voters")
election.years <- c(1996, 2000, 2004, 2008, 2012, 2016)
electorate <- electorate[electorate$PERIODO %in% election.years,]
data <- merge(data, electorate, by=c("PERIODO", "MunCode"))

# Columns whose names contain "F." / "M." hold the female / male recruit counts
female.cols <- grepl("F[.]", names(data))
male.cols <- grepl("M[.]", names(data))
women.recruit.share <- 1000*data[, female.cols] / data$Female.Voters
men.recruit.share <- 1000*data[, male.cols] / data$Male.Voters

# Gap = male share minus female share, column by column; the gap columns take the prefix "G."
gap <- men.recruit.share - women.recruit.share
names(gap) <- gsub("M[.]", "G.", names(gap))
data <- cbind(data, gap)
saveRDS(data, "../1_data/cleaned/data_cleaned_membership_PartyMunicipalityMonth.rds")

rm( list=ls() )
cat("Finished building the party membership datasets!\n")


